#Dennis moskov, Master Thesis
#Prediction with Random Forest
#conversion, selectivity and yield
#folds by articles
#using "randomForest" package

#install.packages("randomForest")
#library(randomForest)

#randomly shuffle the data
set.seed(77)                      # seed for reproducibility
DBf<-DB[sample(nrow(DB)),]

#initiate possible results
results<-rbind(c("Conversion","Selectivity","Yield"),c("X.MeOH","S.MeOH","Y.MeOH"),c(length(DBf)-2,length(DBf)-1,length(DBf)))

#loop through different outcomes
for (r in 1:3) {

#use desired outcome
useDB<-DBf[-c(as.numeric(results[3,-r]))]

#find best number of used variables
nv <- tuneRF(useDB[,-c(1,length(useDB))], useDB[,length(useDB)],stepFactor=1.5, improve=0.005,ntreeTry=500)
bestnv<-nv[which(grepl(min(nv[,2]), nv[,2]))]

#initiate results lists
res<-list()
res.names<-c("number","fold","predicted","observed","residuals","residuals_squared")
resreg<-list()
resreg.names<-c("fitted","observed","residuals","residuals_squared")

#divide data by article number
ref.num.CV <- unique(useDB[,1])

#choose number of folds
k <- 5	

#build folds with unique article numbers
for(j in 1:k){
  if(j<k){
    change.index <- ref.num.CV[trunc(length(ref.num.CV)/k*(j-1)+1):trunc(length(ref.num.CV)/k*j)]
  }else{
    change.index <- ref.num.CV[trunc(length(ref.num.CV)/k*(j-1)+1):(length(ref.num.CV))]
  }
  for(l in 1:length(change.index)){
    useDB[(DBf[,1]==change.index[l]),1] = j
  }
}

#initiate matrices for regression and prediction values
reg<-matrix(, nrow = 10,ncol = k)
colnames(reg, do.NULL = FALSE)
colnames(reg) <-  colnames(reg, do.NULL = FALSE, prefix = "fold ")
rownames(reg) <- c("Sample","RSS","TSS","MSS","R","R","adj.R","MSE","RMSE","SDEC")
predic<-matrix(,nrow=4,ncol=1)
colnames(predic) <-  "overall"
rownames(predic) <- c("PRESS","Q","PSE","SDEP")

#initiate matrix for used variables for tree construction
tvar<-matrix(,length(useDB)-2, k+2)			
rownames(tvar)<-names(useDB[-c(1,length(useDB))])	
colnames(tvar) <-  colnames(tvar, do.NULL = FALSE, prefix = "fold ")
dimnames(tvar)[[2]][(k+1):(k+2)]<-c("overall","std. 100%")

#initiate matrices for used variable importance
impMSE<-matrix(,length(useDB)-2, k+2)			
rownames(impMSE)<-names(useDB[-c(1,length(useDB))])	
colnames(impMSE) <-  colnames(impMSE, do.NULL = FALSE, prefix = "fold ")
dimnames(impMSE)[[2]][(k+1):(k+2)]<-c("overall","std. 100%")
impNode<-matrix(,length(useDB)-2, k+2)			
rownames(impNode)<-names(useDB[-c(1,length(useDB))])	
colnames(impNode) <-  colnames(impNode, do.NULL = FALSE, prefix = "fold ")
dimnames(impNode)[[2]][(k+1):(k+2)]<-c("overall","std. 100%")

#perform k fold cross validation
for(i in 1:k){
   	#segement data by fold  
    	testIndexes <- which(useDB[,1]==i,arr.ind=TRUE)
  	testData <- useDB[testIndexes, ]               #test data fold k
    	testData <- testData[-1]			#remove fold column
    	trainData <- useDB[-testIndexes, ]             #training data fold k
    	trainData <- trainData[-1]			#remove fold column
 
	#fit a random forest
        nt<-500
        fit<-randomForest(trainData[,-length(trainData)],trainData[,length(trainData)],mtry=bestnv,importance=TRUE,ntree=nt)

	pred<-predict(fit,testData, predict.all=FALSE)

	#save results for regression of each fold
    	resreg$fitted<-unname(fit$predicted)
    	resreg$observed<-trainData[,length(trainData)]
   	resreg$residuals<-resreg$observed-resreg$fitted
   	resreg$residuals_squared<-(resreg$residuals)^2

	#save results for prediction of each fold
    	res$number<-c(res$number,as.numeric(names(pred)))
    	res$fold<-c(res$fold,rep(i,length(pred)))
    	res$predicted<-c(res$predicted,unname(pred))

	#save used variables for each fold
	uvn<-varUsed(fit, by.tree=FALSE, count=FALSE)  #variable number
	uv<-varUsed(fit, by.tree=FALSE, count=TRUE)    #variable frequency
	names(uv)<-names(useDB[uvn])		       #variable names
	tvar[,i]<-unname(uv)
	
	#variable importances for each fold
	impMSE[,i] <- importance(fit)[,1]
	impNode[,i] <- importance(fit)[,2]
	
	#values for model fitness for each fold
    	reg["Sample",i]<-nrow(trainData)                 			#number of datapoints in training set of the fold
    	reg["RSS",i]<-sum(resreg$residuals_squared)  				#Residual Sum of Squares
   	reg["TSS",i]<-sum((resreg$observed-mean(resreg$observed))^2)      	#Total Sum of Squares
   	reg["MSS",i]<-reg["TSS",i]-reg["RSS",i]                 		#Model Sum of Squares
    	reg["R",i]<-reg["MSS",i]/reg["TSS",i]                          	#coefficient of determination
    	reg["R",i]<-sqrt(reg["R",i])                           		#multiple correlation coefficient
   	reg["adj.R",i]<-1-((1-reg["R",i])*((reg["Sample",i]-1)/(reg["Sample",i]-length(uv))))                        		
										#Adjusted coefficient of determination
   	reg["MSE",i]<-sum(fit$mse)/length(fit$mse)                        	#Mean square error
   	reg["RMSE",i]<-sqrt(reg["MSE",i])                        		#Root Mean square error (residual standard deviation)
   	reg["SDEC",i]<-sqrt(reg["RSS",i]/reg["Sample",i])       		#Standard Deviation Error in Calculation
}

#average used variables and standardized to 100%
tvar[,k+1]<-rowSums(tvar[,1:k])/k
tvar[,k+2]<-(100/sum(tvar[,k+1]))*tvar[,k+1]

#average variable importances and standardized to 100%
impMSE[,k+1]<-rowSums(impMSE[,1:k])/k
impMSE[,k+2]<-(100/sum(impMSE[,k+1]))*impMSE[,k+1]
impNode[,k+1]<-rowSums(impNode[,1:k])/k
impNode[,k+2]<-(100/sum(impNode[,k+1]))*impNode[,k+1]

#average values for model fitness
reg<-cbind(reg,rowSums(reg)/k)
colnames(reg)[length(colnames(reg))]<-"overall"

#calculate residuals predicted
res$observed<-useDB[,length(useDB)]
res$residuals<-res$observed-res$predicted
res$residuals_squared<-(res$residuals)^2

#evaluation values for goodness of prediction                                     
predic["PRESS",1]<-sum(res$residuals_squared)                            #Predictive Error Sum of Squares
predic["Q",1]<-1-(predic["PRESS",1]/reg["TSS","overall"])               #Cross-validated predictive R2
predic["PSE",1]<-predic["PRESS",1]/length(res$residuals_squared)         #predictive squared error
predic["SDEP",1]<-sqrt(predic["PSE",1])                                  #standard deviation error in prediction




#--------------------------------PLOT------------------------------------------------------------------------------------
#plot predicted vs. observed
x11()
plot(res$predicted,res$observed,pch=16, col=res$fold, xlim=c(0,1),ylim=c(0,1),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))
legend("topright", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))

x11()
plot(res$predicted,res$observed,pch=16, col=res$fold,xlim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),ylim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))
legend("topright", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))

#plot predicted vs. residuals
x11() 
plot(res$predicted,res$residuals, col=res$fold, xlim=c(0,1),ylim=c(-1,1),xlab=paste("Predicted MeOH",results[1,r]), xaxs="i",yaxs="i",ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)

x11() 
plot(res$predicted,res$residuals, col=res$fold, xlim=c(min(res$predicted),max(res$predicted)), ylim=c(min(res$residuals),max(res$residuals)),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)


#plot residual density
x11()
plot(density(res$residuals),xlab="Residuals", ylab="Density",main=paste("Density Plot of Residuals for",results[1,r]))



#plot histogram of used variables
x11()
par(mar=c(5,4,4,5)+.1)
barplot(rev(sort(tvar[,k+1])),las=2,main=paste("Frequency and Percentage of Variables\nwhich are Used for Forest Construction for MeOH",results[1,r]),ylab="Frequency",ylim=c(0, 1.2*max(tvar[,k+1])))     
par(new=TRUE)
barplot(rev(sort(tvar[,k+2])),xaxt="n",yaxt="n",xlab="",ylab="",ylim=c(0, 1.2*max(tvar[,k+2])))
axis(4)
mtext("Percentage (%)",side=4,line=3)

#plot histogram of variable importance MSE
x11()
par(mar=c(5,4,4,5)+.1)
barplot(rev(sort(impMSE[,k+1])),las=2,main=paste("Variable Importance by % Increase in MSE for MeOH",results[1,r]),ylab="% Increase in MSE",ylim=c(0, 1.2*max(impMSE[,k+1])))     
par(new=TRUE)
barplot(rev(sort(impMSE[,k+2])),xaxt="n",yaxt="n",xlab="",ylab="",ylim=c(0, 1.2*max(impMSE[,k+2])))
axis(4)
mtext("Scaled to 100 %",side=4,line=3)

#plot histogram of variable importance node purity
x11()
par(mar=c(5,4,4,5)+.1)
barplot(rev(sort(impNode[,k+1])),las=2,main=paste("Variable Importance by Increase in Node Purity \nfor MeOH",results[1,r]),ylab="Increase in Node Purity",ylim=c(0, 1.2*max(impNode[,k+1])))     
par(new=TRUE)
barplot(rev(sort(impNode[,k+2])),xaxt="n",yaxt="n",xlab="",ylab="",ylim=c(0, 1.2*max(impNode[,k+2])))
axis(4)
mtext("Scaled to 100 %",side=4,line=3)



View(predic)
View(reg)

#-----------SAVE-----------------------------------------------------------------------------

#regression and prediction results
write.csv(reg, file =paste(results[1,r]," regression_values.csv"))
write.csv(predic, file =paste(results[1,r]," prediction_values.csv"))
write.csv(res, file =paste(results[1,r]," prediction_results.csv"), row.names=FALSE)
write.csv(resreg, file =paste(results[1,r]," regression_results.csv"), row.names=FALSE)
write.csv(tvar,file=paste(results[1,r]," Variables_Forest_Construction.csv"))

#plot predicted vs. observed
png(filename=paste(results[1,r]," predVSobs full.png"))
par(new=TRUE, pch=16)
plot(res$predicted,res$observed,pch=16, col=res$fold,xlim=c(0,1),ylim=c(0,1),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))
legend("topright", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))
dev.off()

png(filename=paste(results[1,r]," predVSobs cropped.png"))
par(new=TRUE, pch=16)
plot(res$predicted,res$observed,pch=16, col=res$fold,xlim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),ylim=c(min(0,min(res$predicted)),max(max(res$observed),max(res$predicted))),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Predicted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(1:k), lty ="longdash",title = "folds",col=unique(res$fold))
legend("topright", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))
dev.off()


#plot predicted vs. residuals
png(filename=paste(results[1,r]," predVSres full.png"))
plot(res$predicted,res$residuals, col=res$fold,xlim=c(0,1),ylim=c(-1,1),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()

png(filename=paste(results[1,r]," predVSres cropped.png"))
plot(res$predicted,res$residuals, col=res$fold,xlim=c(min(res$predicted),max(res$predicted)), ylim=c(min(res$residuals),max(res$residuals)),xaxs="i",yaxs="i",xlab=paste("Predicted MeOH",results[1,r]), ylab="Residuals",main=paste("Predicted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()

#plot residual density
png(filename=paste(results[1,r]," resDensity.png"))
plot(density(res$residuals),xlab="Residuals", ylab="Density",main=paste("Density Plot of Residuals for",results[1,r]))
dev.off()



#plot histogram of used variables
png(filename=paste(results[1,r],"UsedVar.png"))
par(mar=c(5,4,4,5)+.1)
barplot(rev(sort(tvar[,k+1])),las=2,main=paste("Frequency and Percentage of Variables\nwhich are Used for Forest Construction for MeOH",results[1,r]),ylab="Frequency",ylim=c(0, 1.2*max(tvar[,k+1])))     
par(new=TRUE)
barplot(rev(sort(tvar[,k+2])),xaxt="n",yaxt="n",xlab="",ylab="",ylim=c(0, 1.2*max(tvar[,k+2])))
axis(4)
mtext("Percentage (%)",side=4,line=3)
dev.off()

#plot histogram of variable importance MSE
png(filename=paste(results[1,r],"ImpVarMSE.png"))
par(mar=c(5,4,4,5)+.1)
barplot(rev(sort(impMSE[,k+1])),las=2,main=paste("Variable Importance by % Increase in MSE for MeOH",results[1,r]),ylab="% Increase in MSE",ylim=c(0, 1.2*max(impMSE[,k+1])))     
par(new=TRUE)
barplot(rev(sort(impMSE[,k+2])),xaxt="n",yaxt="n",xlab="",ylab="",ylim=c(0, 1.2*max(impMSE[,k+2])))
axis(4)
mtext("Scaled to 100 %",side=4,line=3)
dev.off()

#plot histogram of variable importance node purity
png(filename=paste(results[1,r],"ImpVarNode.png"))
par(mar=c(5,4,4,5)+.1)
barplot(rev(sort(impNode[,k+1])),las=2,main=paste("Variable Importance by Increase in Node Purity \nfor MeOH",results[1,r]),ylab="Increase in Node Purity",ylim=c(0, 1.2*max(impNode[,k+1])))     
par(new=TRUE)
barplot(rev(sort(impNode[,k+2])),xaxt="n",yaxt="n",xlab="",ylab="",ylim=c(0, 1.2*max(impNode[,k+2])))
axis(4)
mtext("Scaled to 100 %",side=4,line=3)
dev.off()


}







